Pancreas TRAs

# ----------------------------------------------------------
# TRAs 
# ----------------------------------------------------------

# set working directory to folder with TRA tables (rawdata/TRA Daten)
projectPath <- dirname(rstudioapi::getSourceEditorContext()$path)
setwd(paste(projectPath, "rawdata", "TRA Daten", sep = "/"))

# read in all 6 tables
TRAs1_human <-read_csv("Human_protein_atlas_TRA_5median_genes_annotated.tsv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Ensembl_human_genes = col_character(),
##   Chromosome = col_character(),
##   Startsite = col_double(),
##   Strand = col_double(),
##   Band = col_character(),
##   Symbol = col_character(),
##   EntrezGene = col_double(),
##   Unigene = col_character(),
##   Tissue_number = col_double(),
##   Tissues = col_character(),
##   Max_tissue = col_character()
## )
TRAs2_human <-read_tsv("tra.2014.human.5x.table.tsv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   ensembl.transcript = col_character(),
##   ensembl.gene = col_character(),
##   gene.symbol = col_character(),
##   entrezID = col_character(),
##   refseqID = col_character(),
##   unigeneID = col_character(),
##   chrom = col_character(),
##   startsite = col_double(),
##   tiss.number = col_double(),
##   tissues = col_character(),
##   max.tissue = col_character()
## )
TRAs3_human <-read_tsv("tra.2014.human.roth.5x.table.tsv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   ensembl.transcript = col_character(),
##   ensembl.gene = col_character(),
##   gene.symbol = col_character(),
##   entrezID = col_character(),
##   refseqID = col_character(),
##   unigeneID = col_character(),
##   chrom = col_character(),
##   startsite = col_double(),
##   tiss.number = col_double(),
##   tissues = col_character(),
##   max.tissue = col_character()
## )
TRAs4_mouse <-read_tsv("tra.2014.mouse.5x.table.tsv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   ensembl.transcript = col_character(),
##   ensembl.gene = col_character(),
##   gene.symbol = col_character(),
##   entrezID = col_character(),
##   refseqID = col_character(),
##   unigeneID = col_character(),
##   chrom = col_character(),
##   startsite = col_double(),
##   tiss.number = col_double(),
##   tissues = col_character(),
##   max.tissue = col_character()
## )
TRAs5_mouse <-read_tsv("tra.2014.mouse.4301.5x.table.tsv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   ensembl.transcript = col_character(),
##   ensembl.gene = col_character(),
##   gene.symbol = col_character(),
##   entrezID = col_character(),
##   refseqID = col_character(),
##   unigeneID = col_character(),
##   chrom = col_character(),
##   startsite = col_double(),
##   tiss.number = col_double(),
##   tissues = col_character(),
##   max.tissue = col_character()
## )
TRAs6_human <-read_tsv("tra.2017.human.gtex.5x.table.tsv",col_types = cols(ensembl.chrom=col_character()))
#col_types to correct the parsing failure

PancreasTRAs1_human <- TRAs1_human[grep(x=TRAs1_human$Max_tissue,"panc"),]
PancreasTRAs2_human <- TRAs2_human[grep(x=TRAs2_human$max.tissue,"Panc"),]
PancreasTRAs3_human <- TRAs3_human[grep(x=TRAs3_human$max.tissue,"Panc"),]
PancreasTRAs4_mouse <- TRAs4_mouse[grep(x=TRAs4_mouse$max.tissue,"panc"),]
PancreasTRAs5_mouse <- TRAs5_mouse[grep(x=TRAs5_mouse$max.tissue,"panc"),]
PancreasTRAs6_human <- TRAs6_human[grep(x=TRAs6_human$max.tissue,"Panc"),]

# create character vectors with pancreas specific genes (for both humans & mice)
pancreas_gene_human <- unique(c(PancreasTRAs1_human$Symbol,PancreasTRAs2_human$gene.symbol,PancreasTRAs6_human$ensembl.symbol))
TRA_pancreas_genes_human <- pancreas_gene_human

# all TRAs
TRA_genes_human <- unique(c(TRAs1_human$Symbol,TRAs2_human$gene.symbol,TRAs6_human$ensembl.symbol))

Creating expression matrix

# ----------------------------------------------------------
# Create vsnrma normalized expression matrix
# ----------------------------------------------------------

set.seed(234) # with this vsnrma gives out the same values each time

# get CEL files
setwd(paste(projectPath, "rawdata", "rawdata breast GSE27830", sep = "/"))
data_breast <- ReadAffy()

# change cdf
data_breast@cdfName <- "HGU133Plus2_Hs_ENST"

# create expression matrix
breast_matrix <- exprs(data_breast)

# store colnames (microarray)
breast_microarray_information <- colnames(breast_matrix)

# vsnrma normalization
breast_vsnrma <- vsnrma(data_breast)
## Calculating Expression
# expression matrix of vsnrm normalized data set
breast_vsnrma_matrix <- exprs(breast_vsnrma)

# cut rownames at "." 
rownames(breast_vsnrma_matrix) <- str_replace(rownames(breast_vsnrma_matrix),"\\..*","") 

# remove ending .CEL from colnames 
colnames(breast_vsnrma_matrix) <- str_remove(colnames(breast_vsnrma_matrix),"_.CEL")

# remove control probes (AFFX) 
breast_vsnrma_matrix <- breast_vsnrma_matrix[which(!startsWith(rownames(breast_vsnrma_matrix), "AFFX")),]

# transcript IDs
breast_transcript_names <- rownames(breast_vsnrma_matrix)


# ----------------------------------------------------------
# Create data frames which hold information about microarrays
# ----------------------------------------------------------

# create data frame with information about microarray samples

breast_microarrays <- data.frame(number = substr(breast_microarray_information, 1,9),
                                 row.names = c(1:10))

# replace old IDs sample names
colnames(breast_matrix) <- breast_microarrays[,"number"]
colnames(breast_vsnrma_matrix) <- breast_microarrays[,"number"]

# ----------------------------------------------------------
# Convert transcript IDs to gene symbols
# ----------------------------------------------------------

# get table for converting transcript IDs to gene symbols
setwd(paste(projectPath, "rawdata", "tables" ,sep = "/"))
annotation <- read.csv("ensemble.103.txt") 
ensemble_transcripts <- annotation[,"Transcript.stable.ID"]
ensemble_symbol <- annotation[,"HGNC.symbol"]
# name each gene symbol by their respective transcripts
names(ensemble_symbol) <- ensemble_transcripts
# select only those transcripts that are also present in the annotation
breast_GeneExprs <- breast_vsnrma_matrix[rownames(breast_vsnrma_matrix) %in% ensemble_transcripts,]

# create vector with the symbols of the transcript from expression matrix
breast_symbol <- ensemble_symbol[rownames(breast_GeneExprs)]

# use gene symbols instead of transcripts as rownames
rownames(breast_GeneExprs) <- as.character(breast_symbol)

# order genes alphabetically & remove all those rows where "" is the rowname 
breast_GeneExprs <- breast_GeneExprs[order(rownames(breast_GeneExprs)),][1097:nrow(breast_GeneExprs),] 

# replace colnames with the GSM number of the microarrays
colnames(breast_GeneExprs) <- breast_microarrays[,"number"]


# ----------------------------------------------------------
# Breast cancer specific TRAs
# ----------------------------------------------------------

# create a expression matrix with only those genes
breast_GeneExprs_sub <- breast_GeneExprs[which(rownames(breast_GeneExprs) %in% TRA_pancreas_genes_human),]

# ----------------------------------------------------------
# TRAs for all tissues
# ----------------------------------------------------------

breast_GeneExprs_allTRA <- breast_GeneExprs[which(rownames(breast_GeneExprs) %in% TRA_genes_human),]

# ----------------------------------------------------------
# combining transcripts for same gene via median
# ----------------------------------------------------------

combineGeneExprs_median2 <- function(exprsmatrix){
  # create data frame with a column of all the gene symbols
  data <- data.frame(geneSymbols = rownames(exprsmatrix),
                     exprsmatrix, 
                     row.names = NULL)
  # group data frame by gene symbols and combine values of one gene by median
  combined <- aggregate(data[-1], by = list(data$geneSymbols) , FUN = median)
  # convert data frame to matrix (only expression values, no row names)
  result <- as.matrix(combined[,-1])
  # use gene symbols as row names
  rownames(result) <- combined[,1]
  # output
  result
}

breast_GeneExprs_combined <- combineGeneExprs_median2(breast_GeneExprs)
breast_GeneExprs_sub_combined <- combineGeneExprs_median2(breast_GeneExprs_sub)

Quality control

# ----------------------------------------------------------
# Single chip control
# ----------------------------------------------------------

#setwd(paste(projectPath, "plots", sep = "/"))

for (i in 1:10){
  image(data_breast[,i], col = rainbow (100, start = 0, end = 0.75)[100:1])
  file.name = paste(as.character(breast_microarrays[, "number"])[i],".pdf", sep = "")
  #dev.copy2pdf(file = file.name)
}

## The chips do not appear to be faulty.

# ----------------------------------------------------------
# Pheno Data
# ----------------------------------------------------------

pData(data_breast)
##               sample
## GSM687017.CEL      1
## GSM687018.CEL      2
## GSM687019.CEL      3
## GSM687020.CEL      4
## GSM687021.CEL      5
## GSM687022.CEL      6
## GSM687023.CEL      7
## GSM687024.CEL      8
## GSM687025.CEL      9
## GSM687026.CEL     10
# ----------------------------------------------------------
# MeanSdPlot
# ----------------------------------------------------------

meanSdPlot(breast_vsnrma, plot = FALSE)$gg + theme(aspect.ratio = 1)

#setwd(paste(projectPath, "plots", sep = "/"))
#dev.copy2pdf(file = "breast_meanSdPlot_vsnrma_normalized.pdf")

## Red line is mostly horizontal. Therefore, mean - standard deviation have no dependence. However, there is a slight upwards trend of the standard deviation.

# ----------------------------------------------------------
# RNA degradation plot: breast data set
# ----------------------------------------------------------

RNAdeg_breast <- AffyRNAdeg(data_breast)

# Shift and scale
par(pty = "s", mai = c(1.1,0.5,0.5,0.1))
plotAffyRNAdeg(RNAdeg_breast, col = rainbow(10))
title(sub = "Breast Cancer Rawdata")

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_rnadeg_rawdata.pdf")

# Shift
par(pty = "s", mai = c(1.1,0.5,0.5,0.1))
plotAffyRNAdeg(RNAdeg_breast, col = rainbow(10), transform = "shift.only")
title(sub = "Breast Cancer Scaled")

#dev.copy2pdf(file = "breast_rnadeg_shift_rawdata.pdf")

## No crossing of lines was detected. The quality of the chips is good.

# ----------------------------------------------------------
# Histogram before and after normalization
# ----------------------------------------------------------

## Before normalization
par(mai = c(0.9,0.9,0.9,0.5))

hist(data_breast, 
     col = rainbow(10), 
     main = "Density function of log intensity (data set: GSE27830, Foekens et al.)\nbefore normalization")

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_hist_rawdata.pdf")

## After normalization
par(mai = c(0.9,0.9,0.9,0.5))

plot(density(breast_vsnrma_matrix), 
     type = "n", xlab = "log Intensity", ylim = c(0, 0.6),
     main = "Density function of log intensity (data set: GSE27830, Foekens et al.)\nafter normalization")

for (i in 1:ncol(breast_vsnrma_matrix)){
  lines(density(breast_vsnrma_matrix[,i]), col = rainbow(10)[i])
}

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_hist_vsnrma_normalized.pdf")

# ----------------------------------------------------------
# Boxplot before and after normalization
# ----------------------------------------------------------

## Before normalization
par(las = 2, mai = c(1, 0.8, 0.7, 0.1)) 

boxplot(data_breast, names = breast_microarrays[, "number"], 
        col = rainbow(10),  ylab = "Intensity values",
        main = "Gene expression in breast cancer before vsnrma normalization\ndata set: GSE27830 (Foekens et al.)", 
        cex.axis=0.8)

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_boxplot_rawdata.pdf")

## After normalization

par(las = 2, mai = c(1, 0.8, 0.7, 0.1)) 

boxplot(breast_vsnrma_matrix, names = breast_microarrays[, "number"], 
        col = rainbow(10), ylab = "Intensity values",
        main = "Gene expression in breast cancer after vsnrma normalization\ndata set: GSE27830 (Foekens et al.)", 
        cex.axis=0.8)

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_boxplot_vsnrma_normalized.pdf")

# ----------------------------------------------------------
# Scatterplots
# ----------------------------------------------------------

#setwd(paste(projectPath, "plots" ,sep = "/"))
par(mai = c(0.9, 0.9, 0.7, 0.3))

for (i in 1:9) {
  for (j in (i+1):10){
    
    plot(breast_vsnrma_matrix[,i], breast_vsnrma_matrix[,j], pch = ".",
       xlab = breast_microarrays[, "number"][i], 
       ylab = breast_microarrays[, "number"][j])
    abline(0, 1, col = "red")
    
    title(main = paste("Scatterplot of probe",
                     breast_microarrays[, "number"] [i],
                     "and", 
                     breast_microarrays[, "number"] [i+1],
                     sep = " ", collapse = NULL))
    
    file.name = paste("Scatterplot_",
                    as.character(breast_microarrays[, "number"] [i], 
                    "_", as.character(breast_microarrays[, "number"][i+1]), ".pdf", 
                    sep =" "))
    #dev.copy2pdf(file = file.name)
    
  }
}

## The scatterplots never show an abnormal distribution.

Descriptive Statistics

# ----------------------------------------------------------
# Boxplots containing all TRA genes present in breast cancer
# ----------------------------------------------------------

# Five boxplots are created, each counting 50 genes, to ensure better visibility and accessibility of each gene.

par(las = 2, 
    cex.axis = 0.5, cex.main = 0.8,
    mai = c(0.65,0.4,0.4,0.1), mfrow = c(2,1))

for (i in seq(1,250,by = 50)){
  
  generange = paste0("(", rownames(breast_GeneExprs_sub_combined)[i],
                     " - ", 
                     rownames(breast_GeneExprs_sub_combined)[i+49],")")
  
  boxplot(t(breast_GeneExprs_sub_combined[i:(i+49),filter(breast_microarrays)$number]),
          col = rainbow(50), 
          main= paste0("Boxplot of TRA gene expression in breast cancer ", generange, "\ndata set: GSE27830 (Foekens et al.)"),
          horizontal = FALSE)
  abline(h = median(apply(breast_vsnrma_matrix, 1, median)))
  
  filename = paste0("breast_geneexpression_boxplot_", 
                    rownames(breast_GeneExprs_sub_combined)[i],"_",
                    rownames(breast_GeneExprs_sub_combined)[i+49],".pdf")
  print(filename)
  setwd(paste(projectPath, "plots", sep = "/"))
  # dev.copy2pdf(file = filename)
}
## [1] "breast_geneexpression_boxplot_AAK1_CPB1.pdf"

## [1] "breast_geneexpression_boxplot_CRACR2B_GRB10.pdf"
## [1] "breast_geneexpression_boxplot_GRK4_PAFAH2.pdf"

## [1] "breast_geneexpression_boxplot_PAK3_SEL1L.pdf"
## [1] "breast_geneexpression_boxplot_SERPINA3_ZNF780A.pdf"

# ----------------------------------------------------------
# Heatmaps
# ----------------------------------------------------------

filenames <- rownames(pData(data_breast)) 
breast_samples <- substr(filenames, 1, 9)

# Heatmap of the gene expression in breast cancer.
pheatmap(t(breast_GeneExprs_sub), 
         labels_row = paste(breast_samples),
         labels_col = rep("",nrow(breast_GeneExprs_sub)),
         main = "Gene expression in breast cancer \ndata set: GSE27830 (Foekens et al.)",
         cluster_rows = TRUE, 
         cluster_cols = TRUE,
         fontsize_row = 8, fontsize = 10)

# Gene expression of breast cancer after combining.
pheatmap(t(breast_GeneExprs_sub_combined), 
         labels_row = paste(breast_samples),
         labels_col = rep("",nrow(breast_GeneExprs_sub_combined)),
         main = "Gene expression in breast cancer after combination \ndata set: GSE27830 (Foekens et al.)",
         cluster_rows = TRUE, 
         cluster_cols = TRUE,
         fontsize_row = 8, fontsize = 10)

Analysis

## Mean of entire set
mean(breast_GeneExprs_sub_combined)
## [1] 7.241704
## 7.241704

## Median of entire set
median(breast_GeneExprs_sub_combined)
## [1] 6.766971
## 6.766971

## Standard deviation of entire set
sd(breast_GeneExprs_sub_combined)
## [1] 1.41852
## 1.41852
# ----------------------------------------------------------
# High variance
# ----------------------------------------------------------

## Calculate variance of rows (genes)
var_breast <- apply(breast_GeneExprs_sub_combined, 1, var, na.rm=TRUE)

## Create function to filter out the top 25% of variance.
top5 <- function(x) {
  x >= quantile(x, 0.75)
}

## Apply function to variance set, to determine the top 5% of variance.
top5_breast <- top5(var_breast)
top5_breast_names <- which(top5_breast == "TRUE")
high_var_breast_sub <- breast_GeneExprs_sub_combined[top5_breast_names, , drop = FALSE]
dim(high_var_breast_sub)
## [1] 63 10
## data set complete: 250 genes
## data set high variance: 63 genes

# Boxplot of 25% high variance genes

par(las = 2)
par(mai = c(0.7,0.4,0.5,0.1))

boxplot(t(high_var_breast_sub), 
        col = rainbow(63), 
        cex.axis = 0.5, 
        main = "Top 25% high variance genes \ndata set: GSE27830 (Foekens et al.)"); 
abline(h = median(breast_GeneExprs_sub_combined), col = "red")

# ----------------------------------------------------------
# Genes with high expression levels 
# ----------------------------------------------------------

# Median
breast_median <- apply(breast_GeneExprs_sub_combined, 1, median, na.rm=TRUE)
median(breast_median)
## [1] 6.754704
## Median: 6.754704

## Search for genes, whose median gene expression are more than 1.5 times that of the median.
which(breast_median > (1.5 * median(breast_median)))
##    CYB5A     GNAS    NUPR1     RPL5     RPL8     RPS9 SERPINA3     SPP1 
##       58       93      143      192      193      194      201      222 
##     SSR4     XBP1 
##      225      246
## Names of genes that fit this criteria: 10 in total.
## CYB5A     GNAS    NUPR1     RPL5     RPL8     RPS9 SERPINA3     SPP1     SSR4     XBP1

## Expression values of those genes:
breast_median_Gene_Exprs <- breast_GeneExprs[which(breast_median > (1.5 * median(breast_median)))]
breast_median_Gene_Exprs
##  [1] 6.690879 8.422208 7.805209 6.782207 5.740733 5.712064 7.076842 6.705710
##  [9] 6.597868 5.970241
# Pancreas TRA with an median expression above the overall median:
medianbreast <- median(breast_GeneExprs_sub_combined)
length(which(apply(breast_GeneExprs_sub_combined,1,median) > medianbreast))
## [1] 124
# 124 genes have a higher expression than the overall median.

# Creating matrix with overexpressed TRAs in decreasing order:
overexpressedbreast <- breast_GeneExprs_sub_combined[which(apply(breast_GeneExprs_sub_combined,1,median)> medianbreast),]
overexpressedbreastordered <- overexpressedbreast[order(apply(overexpressedbreast,1,median),decreasing=TRUE),]

# Boxplot with all the genes that are higher expressed than overall median.
par(las =2, mai = c(0.7,0.6,0.6,0.2))

boxplot(t(overexpressedbreast),
        col = rainbow(124),
        cex.axis = 0.5, 
        main="Breast Cancer gene expression of overexpressed pancreas TRAs \ndata set: GSE27830 (Foekens et al.)")
abline(h = median(apply(breast_vsnrma_matrix, 1, median)))

# Boxplot with the top 25 genes that are higher expressed than the median.
par(las =2, mai = c(1.2,0.6,0.6,0.2))

boxplot(t(overexpressedbreastordered[1:25,]),
         col = rainbow(25),
         cex.axis = 1,
         main="Gene expression of the 25 highest expressed pancreas TRAs \ndata set: GSE27830 (Foekens et al.)")
abline(h = median(apply(breast_vsnrma_matrix, 1, median)))